/*	This program creates a working dataset for the household survey sample centered
at the month of 50th birthday, for ages 45 to 57.*/


***** Set directories 
* Every project folder is derived from a single root, so moving the project
* only requires editing one line. The resulting paths are unchanged.
local root "~/Dropbox/Retirement gaming"

local dir_rawech "`root'/ech_analisis/BasesHoracio"
local dir_clean  "`root'/clean"
local dir_output "`root'/output/dataverse"
local dir_do     "`root'/dataverse"

* File name of the cleaned dataset written at the end of this do-file
local dataname "ech_clean.dta"

* Start from an empty session (local macros defined above are not affected)
clear all


** APPEND DATA FROM ECH 1996-2016 AND KEEP RELEVANT AGE/COHORTS **

* Stage one tempfile (year6 ... year16) per raw file p6 ... p16.
* The pension-fund, full-contribution and business-income items carry different
* questionnaire codes depending on the wave, so each wave first selects the
* source names and then goes through one shared harmonisation sequence.
forvalues wave = 6/16 {

	* Wave-specific source variable names
	if `wave' == 6 {
		local q_caja    f79
		local q_aporta  f80
		local q_consump g139
		local q_profits g140
	}
	else if `wave' <= 8 {
		local q_caja    f86
		local q_aporta  f87
		local q_consump g147
		local q_profits g148
	}
	else {
		local q_caja    f83
		local q_aporta  f84
		local q_consump g142
		local q_profits g143
	}

	use "`dir_rawech'/p`wave'.dta", clear
	keep bc_* `q_caja' `q_aporta' `q_consump' `q_profits'
	destring bc_correlat, replace

	* Harmonised names
	rename `q_caja' cajajub
	if `wave' >= 7 {
		* Value labels are attached from wave 7 onward only (wave 6 is left unlabelled)
		label define cajajub 1 "bps" 2 "bps y afap" 3 "policial" 4 "militar" 5 "profesional" 6 "notarial" 7 "bancaria"  8 "exterior"
		label values cajajub cajajub
	}
	rename `q_aporta'  aportatot
	rename `q_consump' ybus_consump
	rename `q_profits' ybus_profits

	* Business income is taken from the consumption-withdrawal item only
	g ybusiness=ybus_consump

	tempfile year`wave'
	save `year`wave''
}

* Build the master file starting from p96.dta, then stack every later wave on top.
* In files p96-p99 and p0 business income is the sum of three components.
local ycomp "pg31p pg41p pg51p"

use "`dir_rawech'/p96.dta", clear
keep bc_* `ycomp'
g ybusiness = pg31p + pg41p + pg51p
g ybus_consump=.
g ybus_profits=.
drop `ycomp'

* Files p97, p98, p99 (bc_anio 1997-1999) and p0 (bc_anio 2000): same three-component definition
foreach i of numlist 97/99 0 {
	local yr = cond(`i' >= 97, 1900 + `i', 2000 + `i')
	append using "`dir_rawech'/p`i'.dta", keep(bc_* `ycomp')
	replace ybusiness = pg31p + pg41p + pg51p if bc_anio==`yr'
	drop `ycomp'
	display "year `i' appended"
}

* Files p1-p5 (bc_anio 2001-2005): consumption and profits come from g2_1 / g2_3
forvalues i=1/5 {
	local yr = 2000 + `i'
	append using "`dir_rawech'/p`i'.dta", keep(bc_* g2_1 g2_3)
	replace ybus_consump=g2_1 if bc_anio==`yr'
	replace ybus_profits=g2_3 if bc_anio==`yr'
	replace ybusiness = ybus_consump if bc_anio==`yr'
	drop g2_1 g2_3
	display "year `i' appended"
}

* Files p6-p16: append the tempfiles staged earlier in this do-file
forvalues i=6/16 {
	append using `year`i''
	display "year `i' appended"
}

* Negative values in these two variables are recoded to missing
foreach v of varlist bc_rama bc_pf081 {
	replace `v'=. if `v'<0
}

***********************

* Year and month of interview
* Build a "YYYYMM" string (month zero-padded to two digits) and convert it to a
* Stata monthly date. The string helpers year/calmonth/intdate stay in the data.
tostring bc_anio, g(year)
tostring bc_mes, g(calmonth)
g intdate = cond(strlen(calmonth)==2, year+calmonth, ///
            cond(strlen(calmonth)==1, year+"0"+calmonth, ""))
g t=mofd(date(intdate,"YM"))
format t %tm
destring year, replace

* Family ID: one group per (year, bc_correlat) combination
egen famid=group(year bc_correlat)

* CREATE SAMPLING WEIGHTS THAT ARE BALANCED ACROSS YEARS
* Before 1998 the weights are rescaled so that each year sums to 2527273;
* from 1998 onward the original weight bc_pesoan is used as is.
bys year: egen totw=total(bc_pesoan)
g weight = cond(year<1998, bc_pesoan*(2527273/totw), bc_pesoan)
drop totw


* COHORT AND AGE
* cohort = interview year minus reported age; age is centred at 50
gen cohort = bc_anio -  bc_pe3 
gen age_centered = bc_pe3 - 50
sum age_centered

* NOTE(review): ipc.do is expected to create the price index `ipc' used to deflate earnings later in this file -- confirm
run "`dir_do'/ipc.do"


*** TYPES OF EMPLOYMENT ***
* Activity status indicators built from bc_pobp (0/1, never missing).
g employed		= bc_pobp==2 
g unemployed	= bc_pobp>=3 & bc_pobp<=5
* FIX: the previous definition (bc_pobp>=3) overlapped with `unemployed' (codes 3-5)
* and also flagged missing bc_pobp as inactive, because missing compares as larger
* than any number in Stata.
* NOTE(review): assumes codes 6 and above are the inactive categories -- confirm against the codebook.
g inactive		= bc_pobp>=6 & bc_pobp<.

* Occupational category indicators built from bc_cat2
g empl=bc_cat2==1 
g self_empl= (bc_cat2==3 &  bc_pf081==1) | bc_cat2==4  
g public=bc_cat2==2

*Number of children in family (to impute contributions)
* FIX: this count must be taken BEFORE minors are dropped from the data. It was
* previously computed after `keep if bc_pe3>=18', so ch18hh was 0 for every
* household and the health-insurance rate conditional on ch18hh>0 never applied.
cap gen ch18=0
replace ch18=1 if bc_pe3<18
bys famid: egen ch18hh=total(ch18)

keep if bc_pe3>=18 // keep adults (rows with missing bc_pe3 also pass this condition)

* Formal = registered with social security or covered by health insurance
g formal=bc_register==1 | bc_reg_disse==1 
g formal_ifempl=formal if empl==1
g formal_ifself=formal if self_empl==1

* Copies of the employee / self-employed indicators
foreach X in empl self_empl {
	g sample_`X' = `X'
	g isample_`X'=`X'
}	
* Under-reporting indicator, defined for 2006 onward and when aportatot is not 0.
* NOTE(review): rows with missing aportatot get underrep=0 rather than missing -- confirm this is intended.
g underrep=aportatot==2 if year>=2006 & aportatot!=0

*** 1-digit CIIU
g ciiu1=bc_rama

*** 2-digit CIIU
* First two characters of the string form of bc_pf40.
* NOTE(review): codes with a leading zero lose it in the numeric-to-string conversion -- verify.
cap drop aux
tostring bc_pf40, g(aux)
g ciiu2=substr(aux,1,2)
destring ciiu2, replace
drop aux

* Post-secondary education
* NOTE(review): a missing bc_edu evaluates as >12 and yields highered=1 -- confirm bc_edu has no missing values here.
g highered=bc_edu>12


** NET AND GROSS EARNINGS ** 

* Get values for BFC and BPC 
* NOTE(review): bfc.do is expected to create `bfc' and `bpc' (used below), presumably keyed on year and `mes' -- confirm
destring calmonth, g(mes)
run "`dir_do'/bfc.do" 

** Net earnings from main job  
* Source depends on the type of worker: business income for the self-employed,
* bc_pg11p for private employees, bc_pg21p for public employees.
g Wnet= ybusiness if self_empl==1
replace Wnet=bc_pg11p if empl==1
replace Wnet=bc_pg21p if public==1 
* FIX: the condition used to read `W==0'. W is only created further down, so the
* statement worked solely because Stata expanded the abbreviation W to Wnet
* (and would fail with `set varabbrev off'). The variable is now named explicitly.
replace Wnet=. if Wnet==0

* Flat rate of contributions (to impute gross salary, and then calculate exact contributions)
* = social security (0.15) + FRL (0.00125) + base health insurance (0.03)
g ssflat 	= 0.15 + 0.00125 + 0.03

* Imputed gross salary: net earnings grossed up by the flat rate
g yf_total = Wnet*(1/(1-ssflat))

**  Impute contribution base for self-employed // note: this is a lower bound, self-employed can opt to contribute more and if they have employees they have to contribute on the highest employee salary
g ficto=11*bfc	// minimum contribution base for self-employed workers

* Payroll tax rates 
* Social security
g ao_priv1 = 0.15 
*FRL (re-employment training fund)
g frl = 0.00125
*Health insurance  
* Base rate 3%; from 2008 on, 4.5% (no children in household) or 6% (children in
* household) when imputed gross salary exceeds 2.5 BPC.
gen ap_salud = 0.03
replace ap_salud = 0.045 if yf_total>(2.5*bpc) & ch18hh==0 & year>2007
replace ap_salud = 0.06 if yf_total>(2.5*bpc) & ch18hh>0 & year>2007
 
* Actual contribution rate, with varying health insurance contributions since 2008
g ssrate = ao_priv1 + ap_salud  + frl 

* Estimated amount of contributions
g sscontrib = yf_total * ssrate if empl==1 | public==1
replace sscontrib = ficto * ssrate if self_empl==1 // assuming self-employed contribute on the minimum contribution base (lower bound)

* Estimated gross earnings (ignoring income tax; for the self-employed, contributions are
* those computed above on the minimum contribution base `ficto', not on true earnings)
g W = Wnet 
replace W = Wnet + sscontrib if formal==1

* DEFLATE NET AND GROSS EARNINGS USING CPI
* Dividing by ipc*1000 also rescales earnings to thousands
replace Wnet=Wnet/(ipc*1000) 
replace W=W/(ipc*1000)



  
**** SAMPLE RESTRICTIONS  ****
 

* KEEP ONLY RELEVANT COHORTS (BASED ON AGE AND YEAR OF INT)
* Keeps cohorts 1941-1970; a missing cohort satisfies cohort>=1971 and is dropped too.
drop if cohort<1941 | cohort>=1971

*** Keep observations in the relevant interval around age 50 (BASED ON REPORTED AGE)
keep if age_centered>=-5 & age_centered<=7

*** Keep Jan96 to Dec16
* Monthly-date literals replace the magic numbers 432 and 684 (t>=432 & t<684);
* rows with missing t are dropped, as before.
keep if inrange(t, tm(1996m1), tm(2016m12))

*** Drop women 
* Keeps only bc_pe2==1; rows with missing bc_pe2 are dropped as well.
drop if bc_pe2!=1 

*** Keep people currently working formally in private sector  

* FIX: the pension-fund variable is now written out in full (cajajub). The previous
* `caja' only resolved through variable-name abbreviation, which breaks with
* `set varabbrev off' or if another variable starting with "caja" is ever added.
g profnot= cajajub==5 | cajajub==6
g profnot_predict= bc_tipo_ocup==2 & bc_edu>=16 // predict professionals using occupation and at least 16 years of education
												// note: until 1999 this identifies very few people, probably because of differences in how occupations were coded before 2000
																								
g sample	 		= (empl==1 | self_empl==1) ///  working as private sector employee or self_employed (deletes public employees)
						& formal==1 ///	formal employment (covered by social security or health insurance)
						& bc_pf07==1  ///  only one job (drops people with multiple jobs)
						& bc_rama!=1 ///	drops agricultural sector
						& bc_rama!=4 /// drops construction workers
						& profnot==0 ///	drops people in professional and notarial pension schemes, but keeps people in armed forces, government and science who are not public employees or in those pension schemes
						if year>=2006 // pension schemes only available since 2006
replace sample		= (empl==1 | self_empl==1) ///  working as private sector employee or self_employed (deletes public employees)
						& formal==1 ///	formal employment (covered by social security or health insurance)
						& bc_pf07==1  ///  only one job (drops people with multiple jobs) 
						& bc_rama!=1 ///	drops agricultural sector
						& bc_rama!=4 /// drops construction workers
						& profnot_predict==0 ///	drops science and intellectual occupations, which have most people in professional and notarial pension schemes
						if year<2006 // pension schemes only available since 2006	
keep if sample==1 						

* Drop outliers
* Trims earnings below the 1st and above the 95th (unweighted) percentile of W and
* drops rows with missing W. Equivalent to setting those values to missing first
* and then dropping missing W.
quietly: sum W, d
drop if W<r(p1) | W>r(p95) | W==.

*Dummies for descriptives
* Firm-size indicators come from the two size items bc_pf082 / bc_pf082a.
* When both items are missing the indicator is missing instead of 0.
local nosize "bc_pf082a==. & bc_pf082==."

* No employees: size category 1 and not a private employee
g noempl = cond(`nosize', ., (bc_pf082==1 | bc_pf082a==1) & empl!=1)
* Micro: size category 2, or size category 1 reported by a private employee
g micro  = cond(`nosize', ., (bc_pf082==2 | bc_pf082a==2) | ((bc_pf082==1 | bc_pf082a==1) & empl==1))
* Size category 3
g micro2 = cond(`nosize', ., bc_pf082==3 | bc_pf082a==3)
* Larger firms are identified from bc_pf081
g larger = cond(bc_pf081==., ., bc_pf081==2)

label var noempl "No employees"
label var micro "Firm size $<$5 workers"
label var micro2 "Firm size 5-9 workers"
label var larger "Firm size $\geq$10 workers"

*Firm size
* 0 = no employees, 1 = micro, 2 = micro2, 3 = larger (later categories overwrite earlier ones)
g fsize=0 if noempl==1
local code = 0
foreach size in micro micro2 larger {
	local ++code
	replace fsize=`code' if `size'==1
}

* Sector indicators from the 1-digit industry code
g manufacturing		= ciiu1==2
g retailhospitality	= ciiu1==5 
g transportenergy	= inlist(ciiu1, 6, 3)
g services 			= inlist(ciiu1, 7, 8)

label var manufacturing 	"Manufacturing"
label var retailhospitality "Retail, Restaurants, Hotels"
label var transportenergy 	"Transport, Communications, Energy"
label var services 			"Services, Other"

* Categorical sector: 1 manufacturing, 2 retail/hospitality, 3 transport/energy, 4 services
g sector=.
local code = 0
foreach ind in manufacturing retailhospitality transportenergy services {
	local ++code
	replace sector=`code' if `ind'==1
}

* Sector indicators are undefined when the 2-digit industry code is missing
foreach ind in manufacturing retailhospitality transportenergy services {
	replace `ind'=. if ciiu2==.
}	

label var W "Earnings (1,000 UYP)"
label var bc_pe3 "Age"

destring year, replace

* Age-group indicators and age trend (age_centered = reported age - 50)
g post5053 = (age_centered>=0) & (age_centered<4)	// ages 50 to 53
g post54   = (age_centered>=4)						// age 54 and above
g pre48    = (age_centered<-1)						// age 48 and below
g post50   = (age_centered>=0)						// age 50 and above

*Interactions age trend and shifts
g aget=age_centered
gen aget_post50 = aget*post50
gen aget_post54 = aget*post54

label var post5053 "Age 50-53"
label var post54 "Age$\geq$54"
label var pre48 "Age$\leq$48"
label var post50 "Age$\geq$50"
label var aget "Age trend"
label var aget_post50 "Age$\geq$50 x Age trend"

* Weekly hours: non-positive values and values above 72 (95th percentile) become missing
g hours = bc_horas_1 if bc_horas_1>0 & bc_horas_1<=72
label var hours "Hours of work per week"

* INTERACTIONS FOR DID
* Each age term is multiplied by the self-employment indicator
foreach term in post5053 post54 pre48 post50 aget aget_post50 {
	g `term'_self_empl = `term' * self_empl
}
label var post5053_self_empl	"Self-employed x Age 50-53"
label var post54_self_empl 		"Self-employed x Age$\geq$54"
label var pre48_self_empl 		"Self-employed x Age$\leq$48"
label var post50_self_empl  	"Self-employed x Age$\geq$50"
label var aget_self_empl 		"Self-employed x Age trend"
label var aget_post50_self_empl "Self-employed x Age$\geq$50 x Age trend"

* MICRO FIRM INDICATOR (<10 EMPLOYEES)
* fsize categories 0-2; rows with missing fsize get 0 here and are dropped further down
g small= fsize<3

* Potential experience
* Education is used only when non-negative. With fewer than 12 years of education
* experience is age-18; otherwise it is age-education-6.
g educa= bc_edu if  bc_edu >=0
g potexp = cond(educa<12, bc_pe3-18, bc_pe3-educa-6)
sum bc_pe3 if potexp==27

*Average under-reporting by industry
* Computed by 2-digit industry before the final drops just below
bys ciiu2: egen indurep=mean(underrep) 

* Final drops: require firm size, 2-digit industry and hours to be observed
drop if fsize==. | ciiu2==. | hours==.


* Write the cleaned dataset
save "`dir_clean'/`dataname'", replace














